*July principals to callfire merge 
*compiled and edited by OB
*created 19 aug 2024

clear all
	cd "$data/2_field_experiment/july data"
	use ImportCallfireJuly22_ONEcall.dta
	egen group = group(Variation Treatment), label
	tabstat FemaleNum, by(group) format(%9.2fc) stat(mean N) labelwidth(32)
	
	gen MaleNum=1-FemaleNum
	gen order=.
	replace order=-2 if Treatment==1
	replace order=-1 if Treatment==4
	replace order=0 if Treatment==0
	replace order=1 if Treatment==2
	*graph bar (mean) FemaleNum (mean) MaleNum, yline(0.5) by(Variation, row(1)) over(Treatment, gap(*1) sort(order) label(angle(50))) stack bar(1, fcolor(orange) lcolor(orange)) bar(2, fcolor(ltblue%70) lcolor(ltblue))  ytitle(Proportion) title(Unmatched) ylab(,nogrid) legend(order(1 "Called Female (Mom)" 2 "Called Male (Dad)"))	
	replace order=2 if Treatment==3	
	drop MaleNum order
	//Trying to match on 10 digit number
		//Won't be able to match on 10 digit if from restricted numbers
		brow if num_fromnumber=="." //Manually cut and paste here on the "Restricted Numbers" sheet 
		drop if num_fromnumber=="."
		unique fromnumber // still some duplicates, keep 1st one in timeline
							
		sort fromnumber numtime
		bysort fromnumber: gen drop2=_n
		drop if drop2>1
		drop drop2
		//Any of following can be what we match on
		unique fromnumber
		unique num_fromnumber 		
		unique ContactNameFormat
		sort num_fromnumber
		********************************** 
		* DH add original callfire ID 
		********************************** 
		rename CallfireID CallfireID_new 
		merge 1:1 fromnumber using "$data/2_field_experiment/july data/callfireID_crosswalk_clean.dta"
		drop if _merge == 1
		drop _merge
	save CallfireJuly22_Unique_Phonenum.dta, replace
		
//2. Full dataset
clear all
cd "$data/2_field_experiment/principals data"
import delimited "full_rollout_signatures_merged_allvars_randomized"
	drop __000040 __000041
	*destring fromnumber, generate(num_fromnumber) force //DH change to not destring
	gen num_fromnumber = fromnumber // DH change
	
	**** DH remove phone cleaning here ****
		
		//Drop those with duplicates emails sent to them
	bysort emailaddress: gen dup_emails=_N	
	drop if dup_emails>1	
	//Look for multiple phonenumbers
	drop dup_phone
	bysort fromnumber: gen dup_phone=_N	
	drop if dup_phone>1
	tabstat treatmentnumber, by(treatment) format(%9.2fc) stat(N) labelwidth(32)
	sort fromnumber //72,136 emails went out to a unique phone AND unique email
	gen ContactName7=substr(fromnumber,1 , 7) 
	egen num_ContactName7 = count(ContactName7), by(ContactName7) // 72,136 occur a single time
	save secondphaseranmerged_noDups.dta, replace 
			// this dataset does not have tonumner
	
	
	
//3. Now merge the 2 datasets	
cd "$data/2_field_experiment/july data"
	recast str11 fromnumber
	merge 1:1 fromnumber using CallfireJuly22_Unique_Phonenum.dta  // matched 8,960 that is 52% of the numbers
		// 8960 matched 
		// 63,176 only in master -- same as in ided
		// 8,185 in using  -- similar as in ided
		tabstat FemaleNum if _merge==3, by(group) format(%9.2fc) stat(mean N) labelwidth(32)
		gen MaleNum=1-FemaleNum
		gen order=.
		replace order=-2 if Treatment==1
		replace order=-1 if Treatment==4
		replace order=0 if Treatment==0
		replace order=1 if Treatment==2
		replace order=2 if Treatment==3	
		*graph bar (mean) FemaleNum (mean) MaleNum if _merge==3, yline(0.5) by(Variation, row(1)) over(Treatment, gap(*1) sort(order) label(angle(50))) stack bar(1, fcolor(orange) lcolor(orange)) bar(2, fcolor(ltblue%70) lcolor(ltblue))  ytitle(Proportion) title(10 digit matched) ylab(,nogrid) legend(order(1 "Called Female (Mom)" 2 "Called Male (Dad)"))	
		drop MaleNum order	
		tostring schoolname, replace // DH add
	savesome if _merge==2 using July2022_UnmatchedFrom10.dta, replace
	savesome if _merge==3 using 00_July2022_MatchedFrom10.dta, replace //These will be in final merged datasets that I stack	
	drop if _merge==2
	gen temp_merge=_merge
	drop _merge
	savesome if temp_merge==1 using firstphaseranmerged_noDups_UnmatchedFrom10.dta, replace	//63,176 schools still yet to be matched
	save July2022_MatchCallfire2Principals_10digit.dta, replace 
		//  63,176 missing tonumber, same as ided

//---------Take the calls that were unmatched
use July2022_UnmatchedFrom10.dta, clear	
// drop treatmentnumber - dfname // drop the empty variables
drop treatmentnumber - emailaddress // drop the empty variables (DH change)
drop _merge
drop ContactName7 num_ContactName7
*gen ContactName7 = "1" + ContactName6  // DH remove 
gen ContactName7=substr(fromnumber,1 , 7) //DH add

egen num_ContactName7_C = count(ContactName7), by(ContactName7) // 662 occur a single time, rest have duplicates
unique CallfireID // 8179 unique CallfireIDs, endgoal is to ideally link each of these 8179 to a single principal email sent	
sort ContactName7
	
	//DH change -- stopped drop
*drop   dup_email lowcounty lowfullname namestatecounty dup_nsc rand dup_email_final dup_phone_final dup_nsc_merge linenumber dup_emails dup_phone

unique CallfireID // 8179 unique CallfireIDs, endgoal is to ideally link each of these 8179 to a single principal email sent	
sort ContactName7 fromnumber
	bysort ContactName7: gen num_ContactName7_line=_n
save July2022_UnmatchedFrom10_temp.dta, replace
clear



*************************
local i=1
foreach a of numlist 1/34 {
use July2022_UnmatchedFrom10_temp.dta
	keep if num_ContactName7_line==`i'
	merge 1:m ContactName7 using firstphaseranmerged_noDups_UnmatchedFrom10.dta // DH change
	keep if _merge==3
save July22_MatchCallfire6digit_`i'.dta, replace
local i=`i'+1
di `i'
clear
}

use July22_MatchCallfire6digit_1.dta
local i=2
foreach a of numlist 1/33 {
append using July22_MatchCallfire6digit_`i'.dta
local i=`i'+1
di `i'
}


	unique CallfireID if _merge==3 // 4099 unique CallfireIDs with 33739 records 
	count if num_ContactName7==1&num_ContactName7_C==1&_merge==3 // 698 are uniquely merged, but not sure they are all correct	
	gen CheckMatchLater=0
	gen MaleHigh=0
		gen MaleHigh_P=0
			replace MaleHigh_P=1 if strpos(treatment, "Curtis high")>0
		gen MaleHigh_C=(Treatment==1)
		count if MaleHigh_P==0&MaleHigh_C==1
		replace CheckMatchLater=1 if MaleHigh_P==0&MaleHigh_C==1
		replace MaleHigh=1 if strpos(treatment, "Curtis high")
	gen MaleLow=0
		gen MaleLow_P=0 
			replace MaleLow_P=1 if strpos(treatment, "Curtis low")
		gen MaleLow_C=(Treatment==2)
		replace CheckMatchLater=1 if MaleLow_P==0&MaleLow_C==1
		replace MaleLow=1 if strpos(treatment, "Curtis low")	
	gen FemaleHigh=0
		gen FemaleHigh_P=0
			replace FemaleHigh_P=1 if strpos(treatment, "Audrey high")
		gen FemaleHigh_C=(Treatment==3)
		replace CheckMatchLater=1 if FemaleHigh_P==0&FemaleHigh_C==1
		replace FemaleHigh=1 if strpos(treatment, "Audrey high")	
	gen FemaleLow=0
		gen FemaleLow_P=0
		replace FemaleLow_P=1 if strpos(treatment, "Audrey low")
		gen FemaleLow_C=(Treatment==4)
		replace CheckMatchLater=1 if FemaleLow_P==0&FemaleLow_C==1
		replace FemaleLow=1 if strpos(treatment, "Audrey low")		
	
	sort _merge CallfireID ContactName7 num_ContactName7 num_ContactName7_C CheckMatchLater	
	bysort CallfireID: gen fuzzy=_N
	sort _merge CallfireID ContactName7 num_ContactName7 num_ContactName7_C CheckMatchLater
	
	*brow CallfireID fuzzy ///
		ContactName7 num_ContactName7 num_ContactName7_C ///
		fromnumber link ///
		Variation Treatment treatment CheckMatchLater ///
		*schoolname firstname lastname locationcity locationstate
	
*export excel CallfireID /// DH change: remove some vars that have been dropped
		ContactName7 num_ContactName7 num_ContactName7_C ///
		fromnumber link ///
		Variation Treatment treatment CheckMatchLater ///
		emailaddress locationcity locationstate  ///
		num_fromnumber dup_emails dup_phone tonumber state result answertime duration Wave FemaleNum FemalePrincipal MomListedFirst ContactName6 ContactNameFormat6 ContactNameFormat date numtime phoneID PhoneNumPair DaysSinceStartingEmails NumCalls CallOrder ContactName7 num_ContactName7_C treatmentnumber treatment treatmentname genderofprincipal signature signaturenumber fromperson cc message1 message2 v*  titlesid ncesschoolid districtid districtname schoolid title emailaddress phone   locationcity locationstate locationzip  lowgrade highgrade gradestatus whetheracharterschool totalstudents nurseryprekindergartenoffered kindergartenoffered firstgradeoffered secondgradeoffered thirdgradeoffered fourthgradeoffered fifthgradeoffered sixthgradeoffered seventhgradeoffered eighthgradeoffered ninthgradeoffered tenthgradeoffered eleventhgradeoffered twelfthgradeoffered teacherstotal numberofschools virtualschoolstatus localecode metropolitanormicropolitansta totaloffreelunch* allstudents* classroomteachers titleistatus national* titleistatusprogram titleieligibleschool schoolwidetitlei schooltypology religion schoolorientation schooltype gradelevel totalnumberofstudents sizeofschoolk12ug numberofk12teachers urbancentriccommunitytype lengthofschooldayinhours proportion_male proportion_female gender year_min year_max prob_asian prob_black prob_hispanic prob_white race region_name_code_anon region_code_anon public num_ContactName7 temp_merge _merge CheckMatchLater MaleHigh MaleHigh_P MaleHigh_C MaleLow MaleLow_P FemaleHigh FemaleHigh_C FemaleLow FemaleLow_P FemaleLow_C ///
		using ForRAsToMatch_Step2_v2, sheetreplace firstrow(variables)
		clear all	
//import this sheet here on "Fuzzy Match" tab 
//------------END HERE 8/29/2022-------------------//
		

//------------START HERE 10/14/2022-------------------//	
clear all
import excel "$data/2_field_experiment/july data/ForRAsToMatch_Step2 (July 2022) v3.xlsx", sheet("Fuzzy Match") firstrow
unique MatchedEmail if CorrectMatch>0&CorrectMatch!=. //some duplicates
	//want to find the single MatchedEmail that corresponds to the 1st call from the school
sort MatchedEmail numtime
bysort MatchedEmail: gen MatchedEmail1stCall=_n
unique MatchedEmail if CorrectMatch>0&CorrectMatch!=. &MatchedEmail1stCall==1 // hopefully eliminates duplicates
		//only run code below on sheet after RAS are finished
		drop if CorrectMatch==0
		drop if CorrectMatch==.
		drop if MatchedEmail==""
		drop if MatchedEmail1stCall>1
		//only run code above on sheet after RAs are finished
	bysort CallfireID: gen check=_N // should be 1 for all
	drop if check>1
	drop check	
	//This dataset now contains 2,590 (as of 10/14/2022) lines of CallFire matched to a single School
		gen FemaleNum_num=(FemaleNum=="Female Number (Mom)")
	drop FemaleNum
	rename FemaleNum_num FemaleNum
	drop emailaddress
	rename MatchedEmail emailaddress
	//keep emailaddress FemaleNum CallfireID
	sort emailaddress CallfireID
	bysort emailaddress: gen check=_n	
	drop if check>1 //keeps the 1st call made by the emailaddress
	drop check
	append using "$data/2_field_experiment/july data/00_July2022_MatchedFrom10.dta", force
	keep emailaddress FemaleNum CallfireID // 10,677 (as of 10/14/2022) unique email addresses matched
	unique emailaddress
	unique CallfireID
	cd "$data/2_field_experiment/principals data"
	merge 1:1 emailaddress using secondphaseranmerged_noDups.dta
	
	//if RAs have only done 6 digit match run below
export excel using "ForRAstoMatchStep3_10142022" if _merge!=3, firstrow(variables) replace
	//Cut and paste most important columns to the google doc
	//if RAs have only done 6 digit match run above

	
	
//------------Start here 11/17/2022---------------//
clear all
//School Dataset: 72,136 schools
	cd "$data/2_field_experiment/principals data"
	use secondphaseranmerged_noDups.dta, replace //72,136
	clear
//Call Datasets	
	//A. 10 digit matches: 8,960 10 digit phone-call:school matches
	cd "$data/2_field_experiment/july data"
	use 00_July2022_MatchedFrom10.dta // 8,960
	unique lemail
	unique CallfireID
	keep CallfireID lemail numtime
	save 00_July2022_MatchedFrom10_crosswalk.dta, replace //saves CallfireID lemail
	clear
	//Spreadsheet from RAs which includes all other matches
	//B. Restricted Numbers, started with 408 phone calls, matched 242 to a school but sometimes same school has 2+ calls, so 173 unique schools
	clear
	cd "$data/2_field_experiment/july data"
	import excel "ForRAsToMatch_Step2 (July 2022) v3.xlsx", sheet("Restricted Numbers") firstrow
	keep if CorrectMatch==1 // 242
	drop AM - FL
	save 01_July2022_MatchedRestricted.dta, replace
	gen email=subinstr(MatchedEmail, " ", "", .)
	gen lemail=lower(email)
	sort lemail numtime
	bysort lemail: gen Keep1st=_n
	keep if Keep1st==1
	unique lemail
	unique CallfireID
	keep CallfireID lemail numtime
	save 01_July2022_MatchedRestricted_crosswalk.dta, replace
	//C. Fuzzy Match (aka 6 digit match), started with 4099 unique phone calls, matched 2655 to a school but sometimes same school has 2+ calls, so 2591 unique schools
	clear
	cd "$data/2_field_experiment/july data"
	import excel "ForRAsToMatch_Step2 (July 2022) v3.xlsx", sheet("Fuzzy Match") firstrow	
	unique CallfireID
	keep if CorrectMatch==1
	save "$data/2_field_experiment/july data/02_July2022_MatchedFuzzy.dta", replace
	gen email=subinstr(MatchedEmail, " ", "", .)
	gen lemail=lower(email)
	drop if lemail==""
	sort lemail numtime
	bysort lemail: gen Keep1st=_n
	keep if Keep1st==1
	unique lemail
	unique CallfireID
	keep CallfireID lemail numtime
	save "$data/2_field_experiment/july data/02_July2022_MatchedFuzzy_crosswalk.dta", replace
	//D. NoMatch, started with 4079 unique phone calls, matched 2,712 to a school but sometimes same school has 2+ calls, so 2650 unique schools
	clear
	cd "$data/2_field_experiment/july data"
	import excel "ForRAsToMatch_Step2 (July 2022) v3.xlsx", sheet("No Match") firstrow	
	unique CallfireID
	drop if CallfireID==.
	keep if CorrectMatch==1
	save "$data/2_field_experiment/july data/03_July2022_MatchedNoMatch.dta", replace
	gen email=subinstr(MatchedEmail, " ", "", .)
	gen lemail=lower(email)
	drop if lemail==""
	sort lemail numtime
	bysort lemail: gen Keep1st=_n
	keep if Keep1st==1
	unique lemail
	unique CallfireID
	keep CallfireID lemail numtime
	save "$data/2_field_experiment/july data/03_July2022_MatchedNoMatch_crosswalk.dta", replace
	//D. Notes, started with 738 unique phone calls, matched 564 to a school but sometimes same school has 2+ calls, so 544 unique schools
	clear
	cd "$data/2_field_experiment/july data"
	import excel "ForRAsToMatch_Step2 (July 2022) v3.xlsx", sheet("Notes") firstrow	
	unique CallfireID
	drop if CallfireID==.
	keep if CorrectMatch==1
	save 04_July2022_MatchedNotes.dta, replace
	gen email=subinstr(MatchedEmail, " ", "", .)
	gen lemail=lower(email)
	drop if lemail==""
	drop date numtime
	gen date=substr(answertime, 1,10)+" "+ substr(answertime, 12,8)
	generate double numtime = clock(date, "YMDhms")	
	format numtime %tc		
	sort lemail numtime
	bysort lemail: gen Keep1st=_n
	keep if Keep1st==1
	unique lemail
	unique CallfireID
	keep CallfireID lemail numtime
	save "$data/2_field_experiment/july data/04_July2022_MatchedNotes_crosswalk.dta", replace
	//STACK crosswalks
	clear all
	use 00_July2022_MatchedFrom10_crosswalk.dta
	gen MatchType="10 digit"	
	append using 01_July2022_MatchedRestricted_crosswalk.dta
	replace MatchType="Restricted" if MatchType==""	
	append using 02_July2022_MatchedFuzzy_crosswalk.dta
	replace MatchType="Fuzzy" if MatchType==""
	append using 03_July2022_MatchedNoMatch_crosswalk.dta
	replace MatchType="No Match" if MatchType==""
	append using 04_July2022_MatchedNotes_crosswalk.dta
	replace MatchType="Notes" if MatchType==""
	unique lemail //14551
	unique CallfireID //14931
	bysort lemail: gen check=_N
	drop check
	sort lemail numtime
	bysort lemail: gen Keep1st=_n
	keep if Keep1st==1
	unique lemail
	unique CallfireID	
	sort lemail
	drop Keep1st
	rename numtime numtime_crosswalk
	save "$data/2_field_experiment/july data/July2022_All_crosswalk.dta", replace

//School Dataset: 
	clear all
	cd "$data/2_field_experiment/principals data"

use secondphaseranmerged_noDups.dta, clear
cd "$data/2_field_experiment/july data"
merge 1:1 lemail using July2022_All_crosswalk.dta 

drop _merge
// Drop observations with missing CallfireID
rename CallfireID CallfireID_new 
merge 1:1 fromnumber using "$data/2_field_experiment/july data/callfireID_crosswalk2_clean.dta" // all matched
drop _merge

preserve
keep if CallfireID==.
save temp_missing_callfire.dta, replace
restore
drop if CallfireID==. 
//now do the merge
joinby CallfireID using CallfireJuly22_Unique_Phonenum.dta, unmatched(master) 
				// Unique 
append using temp_missing_callfire.dta 
	replace Wave=5 if Wave==.
	gen CallfireID_Old=CallfireID
	replace CallfireID=CallfireID+100000
	cd "$data/2_field_experiment/july data"
	save July2022_MatchedALL.dta, replace
	
	//Below should be moved into "MergedDataSets"
	cd "$data/2_field_experiment/may data"
	tostring phoneID, replace
	append using May2022_MatchedALL.dta
	replace Wave=4 if Wave==.	
	cd "$data/2_field_experiment/final_data"
	
	rename Variation VariationOld
	gen Variation=.
	replace Variation=0 if strpos(treatment, "Main")&Variation==.
	replace Variation=1 if strpos(treatment, "Payment")&Variation==.
	replace Variation=2 if strpos(treatment, "Constant")&Variation==.
	replace Variation=3 if strpos(treatment, "FT")&Variation==.
	
	rename Treatment TreatmentOld
	gen Treatment=.
	replace Treatment=1 if strpos(treatment, "Curtis high")&Treatment==.
	replace Treatment=2 if strpos(treatment, "Curtis low")&Treatment==.
	replace Treatment=3 if strpos(treatment, "Audrey high")&Treatment==.
	replace Treatment=4 if strpos(treatment, "Audrey low")&Treatment==.	
	replace Treatment=1 if strpos(treatment, "Roy high")&Treatment==.
	replace Treatment=2 if strpos(treatment, "Roy low")&Treatment==.
	replace Treatment=3 if strpos(treatment, "Erica high")&Treatment==.
	replace Treatment=4 if strpos(treatment, "Erica low")&Treatment==.		
	replace Treatment=0 if Treatment==.
	
	drop WhoCalled
	gen WhoCalled=.
	replace WhoCalled=0 if FemaleNum==.
	replace WhoCalled=1 if FemaleNum==1
	replace WhoCalled=2 if FemaleNum==0

	label values WhoCalled WhoCalledl	
	
drop NoCall FemaleNum0 MaleNum0 MaleNum
gen NoCall=0
	replace NoCall=1 if FemaleNum==.
gen FemaleNum0=FemaleNum
	replace FemaleNum0=0 if FemaleNum==.
gen MaleNum=1-FemaleNum	
gen MaleNum0=MaleNum
	replace MaleNum0=0 if MaleNum==.

label values Variation Variationl
label values Treatment Treatl

gen FemaleEmail=(fromperson=="Erica")|(fromperson=="audrey@the-johnsonfamily.net")
**# Bookmark #1
	label define FemaleEmaill ///
		1 "@female" ///
		0 "@male"
	label values FemaleEmail FemaleEmaill	

//Ideal: Equal number from FemaleEmail by Variation/Treatment
//Reality: imbalances on FemailEmail 
//drop group 
//egen group = group(Variation Treatment), label		

		*run "weighting.do"
		
save "$data/2_field_experiment/final_data/MayJuly2022_MatchedALL.dta", replace 
